#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
const char* conv_2d_c16_subgroup_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n"
"#ifdef MNN_SUPPORT_FP16\n"
"#define GROUP_READ(ptr,offset) as_half(intel_sub_group_block_read_us((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_READ2(ptr,offset) as_half2(intel_sub_group_block_read_us2((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_READ4(ptr,offset) as_half4(intel_sub_group_block_read_us4((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_READ8(ptr,offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_WRITE(ptr,offset,val) intel_sub_group_block_write_us((const __global ushort*)(ptr)+(offset),as_ushort(val))\n"
"#define GROUP_WRITE2(ptr,offset,val) intel_sub_group_block_write_us2((const __global ushort*)(ptr)+(offset),as_ushort2(val))\n"
"#define GROUP_WRITE4(ptr,offset,val) intel_sub_group_block_write_us4((const __global ushort*)(ptr)+(offset),as_ushort4(val))\n"
"#define GROUP_WRITE8(ptr,offset,val) intel_sub_group_block_write_us8((const __global ushort*)(ptr)+(offset),as_ushort8(val))\n"
"#define GROUP_SHUFFLE(data,id) as_half(intel_sub_group_shuffle(as_ushort(data),id))\n"
"#define GROUP_SHUFFLE2(data,id) as_half2(intel_sub_group_shuffle(as_ushort2(data),id))\n"
"#define GROUP_SHUFFLE4(data,id) as_half4(intel_sub_group_shuffle(as_ushort4(data),id))\n"
"#define GROUP_SHUFFLE8(data,id) as_half8(intel_sub_group_shuffle(as_ushort8(data),id))\n"
"#else\n"
"#define GROUP_READ(ptr,offset) as_float(intel_sub_group_block_read((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_READ2(ptr,offset) as_float2(intel_sub_group_block_read2((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_READ4(ptr,offset) as_float4(intel_sub_group_block_read4((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_READ8(ptr,offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_WRITE(ptr,offset,val) intel_sub_group_block_write((const __global uint*)(ptr)+(offset),as_uint(val))\n"
"#define GROUP_WRITE2(ptr,offset,val) intel_sub_group_block_write2((const __global uint*)(ptr)+(offset),as_uint2(val))\n"
"#define GROUP_WRITE4(ptr,offset,val) intel_sub_group_block_write4((const __global uint*)(ptr)+(offset),as_uint4(val))\n"
"#define GROUP_WRITE8(ptr,offset,val) intel_sub_group_block_write8((const __global uint*)(ptr)+(offset),as_uint8(val))\n"
"#define GROUP_SHUFFLE(data,id) intel_sub_group_shuffle(data,id)\n"
"#define GROUP_SHUFFLE2(data,id) intel_sub_group_shuffle(data,id)\n"
"#define GROUP_SHUFFLE4(data,id) intel_sub_group_shuffle(data,id)\n"
"#define GROUP_SHUFFLE8(data,id) intel_sub_group_shuffle(data,id)\n"
"#endif\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c16_c4_b2(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
") {\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=(uint)get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 1;\n"
" const int y=(xy/x_blocks);\n"
" const int lid1=(int)get_local_id(1);\n"
" const int feature_per_wg=(int)get_local_size(1)/SLM_DIV_FACTOR;\n"
" const int feature_sub_block=lid1/feature_per_wg;\n"
" const int feature_block=(int)get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(input_pad_left+input_width+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(input_height);\n"
" const uint input_b_pitch=input_fs_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*output_width;\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*batch;\n"
" const uint output_offset=b*output_fs_pitch +\n"
" (feature_block << 2)*output_b_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=16*16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=feature_block*filter_os_pitch;\n"
"#if SLM_DIV_FACTOR == 1\n"
" COMPUTE_FLOAT2 dst=(COMPUTE_FLOAT2)((GROUP_READ(biases,feature_block*16)));\n"
"#else\n"
" COMPUTE_FLOAT2 dst;\n"
" if (feature_sub_block == 0) {\n"
" dst=(COMPUTE_FLOAT2)((GROUP_READ(biases,feature_block*16)));\n"
" } else {\n"
" dst=(COMPUTE_FLOAT2)0;\n"
" }\n"
"#endif \n"
"#if SLM_DIV_FACTOR>1\n"
" __local COMPUTE_FLOAT2 sum[WORK_GROUP_SIZE];\n"
"#endif\n"
"#if SLM_DIV_FACTOR>1\n"
" for (int icb=feature_sub_block*IC_BLOCKS/SLM_DIV_FACTOR; icb<(feature_sub_block+1)*IC_BLOCKS/SLM_DIV_FACTOR; icb++) {\n"
"#else\n"
" for (int icb=0; icb<IC_BLOCKS; icb++) {\n"
"#endif \n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++) {\n"
" if (input_y+kh*DILATION_HEIGHT<0 || input_y+kh*DILATION_HEIGHT >= input_height)\n"
" continue;\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" {\n"
" int xb=0;\n"
" for (; xb+8 <= INPUT_LINE_SIZE; xb += 8) {\n"
" COMPUTE_FLOAT8 tmp=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" line_cache[xb+4]=tmp[4];\n"
" line_cache[xb+5]=tmp[5];\n"
" line_cache[xb+6]=tmp[6];\n"
" line_cache[xb+7]=tmp[7];\n"
" }\n"
" for (; xb+4 <= INPUT_LINE_SIZE; xb += 4) {\n"
" COMPUTE_FLOAT4 tmp=CONVERT_COMPUTE_FLOAT4(GROUP_READ4(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" }\n"
" for (; xb<INPUT_LINE_SIZE; xb++) {\n"
" line_cache[xb]=GROUP_READ(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch);\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++) {\n"
" FLOAT2 src;\n"
" __attribute__((opencl_unroll_hint(2)))\n"
" for (int i=0; i<2; i++) {\n"
"#if FILTER_WIDTH == 1 && DILATION_WIDTH == 1 && STRIDE_WIDTH == 1\n"
" src[i]=line_cache[i];\n"
"#else\n"
" src[i]=line_cache[kw*DILATION_WIDTH+STRIDE_WIDTH*i];\n"
"#endif\n"
" }\n"
" COMPUTE_FLOAT8 weight0=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch));\n"
" COMPUTE_FLOAT8 weight1=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch +\n"
" 8*filter_isv_pitch));\n"
" const COMPUTE_FLOAT2 src0=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,0));\n"
" const COMPUTE_FLOAT2 src1=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,1));\n"
" const COMPUTE_FLOAT2 src2=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,2));\n"
" const COMPUTE_FLOAT2 src3=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,3));\n"
" const COMPUTE_FLOAT2 src4=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,4));\n"
" const COMPUTE_FLOAT2 src5=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,5));\n"
" const COMPUTE_FLOAT2 src6=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,6));\n"
" const COMPUTE_FLOAT2 src7=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,7));\n"
" const COMPUTE_FLOAT2 src8=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,8));\n"
" const COMPUTE_FLOAT2 src9=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,9));\n"
" const COMPUTE_FLOAT2 src10=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,10));\n"
" const COMPUTE_FLOAT2 src11=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,11));\n"
" const COMPUTE_FLOAT2 src12=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,12));\n"
" const COMPUTE_FLOAT2 src13=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,13));\n"
" const COMPUTE_FLOAT2 src14=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,14));\n"
" const COMPUTE_FLOAT2 src15=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,15));\n"
" dst=mad(weight0.s0,src0,dst);\n"
" dst=mad(weight0.s1,src1,dst);\n"
" dst=mad(weight0.s2,src2,dst);\n"
" dst=mad(weight0.s3,src3,dst);\n"
" dst=mad(weight0.s4,src4,dst);\n"
" dst=mad(weight0.s5,src5,dst);\n"
" dst=mad(weight0.s6,src6,dst);\n"
" dst=mad(weight0.s7,src7,dst);\n"
" dst=mad(weight1.s0,src8,dst);\n"
" dst=mad(weight1.s1,src9,dst);\n"
" dst=mad(weight1.s2,src10,dst);\n"
" dst=mad(weight1.s3,src11,dst);\n"
" dst=mad(weight1.s4,src12,dst);\n"
" dst=mad(weight1.s5,src13,dst);\n"
" dst=mad(weight1.s6,src14,dst);\n"
" dst=mad(weight1.s7,src15,dst);\n"
" }\n"
" }\n"
" }\n"
" \n"
"#if SLM_DIV_FACTOR>1\n"
" sum[lid1]=dst;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (feature_sub_block == 0) {\n"
" __attribute__((opencl_unroll_hint)) for(int i=1; i<SLM_DIV_FACTOR; i++)\n"
" dst += sum[lid1 % feature_per_wg+i*feature_per_wg];\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT2)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT2)0,(COMPUTE_FLOAT2)6);\n"
"#endif\n"
" const uint lid_x=sglid % 4;\n"
" const uint lid_y=sglid/4;\n"
" if ((feature_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<2 && (x+i)<output_width; i++) {\n"
" if ((feature_block*16+lid_y*4+lid_x<output_channel))\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" for (int i=0; i<2 && (x+i)<output_width; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" }\n"
"#endif\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c16_c4_b4(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
") {\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=(uint)get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 2;\n"
" const int y=(xy/x_blocks);\n"
" const int lid1=(int)get_local_id(1);\n"
" const int feature_per_wg=(int)get_local_size(1)/SLM_DIV_FACTOR;\n"
" const int feature_sub_block=lid1/feature_per_wg;\n"
" const int feature_block=(int)get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(input_pad_left+input_width+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(input_height);\n"
" const uint input_b_pitch=input_fs_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*output_width;\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*batch;\n"
" const uint output_offset=b*output_fs_pitch +\n"
" (feature_block << 2)*output_b_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=16*16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=feature_block*filter_os_pitch;\n"
"#if SLM_DIV_FACTOR == 1\n"
" COMPUTE_FLOAT4 dst=(COMPUTE_FLOAT4)((GROUP_READ(biases,feature_block*16)));\n"
"#else\n"
" COMPUTE_FLOAT4 dst;\n"
" if (feature_sub_block == 0) {\n"
" dst=(COMPUTE_FLOAT4)((GROUP_READ(biases,feature_block*16)));\n"
" } else {\n"
" dst=(COMPUTE_FLOAT4)0;\n"
" }\n"
"#endif \n"
"#if SLM_DIV_FACTOR>1\n"
" __local COMPUTE_FLOAT4 sum[WORK_GROUP_SIZE];\n"
"#endif\n"
"#if SLM_DIV_FACTOR>1\n"
" for (int icb=feature_sub_block*IC_BLOCKS/SLM_DIV_FACTOR; icb<(feature_sub_block+1)*IC_BLOCKS/SLM_DIV_FACTOR; icb++) {\n"
"#else\n"
" for (int icb=0; icb<IC_BLOCKS; icb++) {\n"
"#endif \n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++) {\n"
" if (input_y+kh*DILATION_HEIGHT<0 || input_y+kh*DILATION_HEIGHT >= input_height)\n"
" continue;\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" {\n"
" int xb=0;\n"
" for (; xb+8 <= INPUT_LINE_SIZE; xb += 8) {\n"
" COMPUTE_FLOAT8 tmp=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" line_cache[xb+4]=tmp[4];\n"
" line_cache[xb+5]=tmp[5];\n"
" line_cache[xb+6]=tmp[6];\n"
" line_cache[xb+7]=tmp[7];\n"
" }\n"
" for (; xb+4 <= INPUT_LINE_SIZE; xb += 4) {\n"
" COMPUTE_FLOAT4 tmp=CONVERT_COMPUTE_FLOAT4(GROUP_READ4(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" }\n"
" for (; xb<INPUT_LINE_SIZE; xb++) {\n"
" line_cache[xb]=(COMPUTE_FLOAT)GROUP_READ(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch);\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++) {\n"
" FLOAT4 src;\n"
" __attribute__((opencl_unroll_hint(4)))\n"
" for (int i=0; i<4; i++) {\n"
"#if FILTER_WIDTH == 1 && DILATION_WIDTH == 1 && STRIDE_WIDTH == 1\n"
" src[i]=line_cache[i];\n"
"#else\n"
" src[i]=line_cache[kw*DILATION_WIDTH+STRIDE_WIDTH*i];\n"
"#endif\n"
" }\n"
" COMPUTE_FLOAT8 weight0=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch));\n"
" COMPUTE_FLOAT8 weight1=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch +\n"
" 8*filter_isv_pitch));\n"
" const COMPUTE_FLOAT4 src0=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,0));\n"
" const COMPUTE_FLOAT4 src1=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,1));\n"
" const COMPUTE_FLOAT4 src2=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,2));\n"
" const COMPUTE_FLOAT4 src3=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,3));\n"
" const COMPUTE_FLOAT4 src4=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,4));\n"
" const COMPUTE_FLOAT4 src5=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,5));\n"
" const COMPUTE_FLOAT4 src6=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,6));\n"
" const COMPUTE_FLOAT4 src7=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,7));\n"
" const COMPUTE_FLOAT4 src8=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,8));\n"
" const COMPUTE_FLOAT4 src9=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,9));\n"
" const COMPUTE_FLOAT4 src10=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,10));\n"
" const COMPUTE_FLOAT4 src11=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,11));\n"
" const COMPUTE_FLOAT4 src12=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,12));\n"
" const COMPUTE_FLOAT4 src13=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,13));\n"
" const COMPUTE_FLOAT4 src14=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,14));\n"
" const COMPUTE_FLOAT4 src15=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,15));\n"
" dst=mad(weight0.s0,src0,dst);\n"
" dst=mad(weight0.s1,src1,dst);\n"
" dst=mad(weight0.s2,src2,dst);\n"
" dst=mad(weight0.s3,src3,dst);\n"
" dst=mad(weight0.s4,src4,dst);\n"
" dst=mad(weight0.s5,src5,dst);\n"
" dst=mad(weight0.s6,src6,dst);\n"
" dst=mad(weight0.s7,src7,dst);\n"
" dst=mad(weight1.s0,src8,dst);\n"
" dst=mad(weight1.s1,src9,dst);\n"
" dst=mad(weight1.s2,src10,dst);\n"
" dst=mad(weight1.s3,src11,dst);\n"
" dst=mad(weight1.s4,src12,dst);\n"
" dst=mad(weight1.s5,src13,dst);\n"
" dst=mad(weight1.s6,src14,dst);\n"
" dst=mad(weight1.s7,src15,dst);\n"
" }\n"
" }\n"
" }\n"
" \n"
"#if SLM_DIV_FACTOR>1\n"
" sum[lid1]=dst;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (feature_sub_block == 0) {\n"
" __attribute__((opencl_unroll_hint)) for(int i=1; i<SLM_DIV_FACTOR; i++)\n"
" dst += sum[lid1 % feature_per_wg+i*feature_per_wg];\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const uint lid_x=sglid % 4;\n"
" const uint lid_y=sglid/4;\n"
" if ((feature_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<4 && (x+i)<output_width; i++) {\n"
" if ((feature_block*16+lid_y*4+lid_x<output_channel))\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" for (int i=0; i<4 && (x+i)<output_width; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" }\n"
"#endif\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c16_c4_b8(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
") {\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=(uint)get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 3;\n"
" const int y=(xy/x_blocks);\n"
" const int lid1=(int)get_local_id(1);\n"
" const int feature_per_wg=(int)get_local_size(1)/SLM_DIV_FACTOR;\n"
" const int feature_sub_block=lid1/feature_per_wg;\n"
" const int feature_block=(int)get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(input_pad_left+input_width+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(input_height);\n"
" const uint input_b_pitch=input_fs_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*output_width;\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*batch;\n"
" const uint output_offset=b*output_fs_pitch +\n"
" (feature_block << 2)*output_b_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=16*16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=feature_block*filter_os_pitch;\n"
"#if SLM_DIV_FACTOR == 1\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(GROUP_READ(biases,feature_block*16));\n"
"#else\n"
" COMPUTE_FLOAT8 dst;\n"
" if (feature_sub_block == 0) {\n"
" dst=(COMPUTE_FLOAT8)(GROUP_READ(biases,feature_block*16));\n"
" } else {\n"
" dst=(COMPUTE_FLOAT8)0;\n"
" }\n"
"#endif \n"
"#if SLM_DIV_FACTOR>1\n"
" __local COMPUTE_FLOAT8 sum[WORK_GROUP_SIZE];\n"
"#endif\n"
"#if SLM_DIV_FACTOR>1\n"
" for (int icb=feature_sub_block*IC_BLOCKS/SLM_DIV_FACTOR; icb<(feature_sub_block+1)*IC_BLOCKS/SLM_DIV_FACTOR; icb++) {\n"
"#else\n"
" for (int icb=0; icb<IC_BLOCKS; icb++) {\n"
"#endif \n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++) {\n"
" if (input_y+kh*DILATION_HEIGHT<0 || input_y+kh*DILATION_HEIGHT >= input_height)\n"
" continue;\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" {\n"
" int xb=0;\n"
" for (; xb+8 <= INPUT_LINE_SIZE; xb += 8) {\n"
" COMPUTE_FLOAT8 tmp=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" line_cache[xb+4]=tmp[4];\n"
" line_cache[xb+5]=tmp[5];\n"
" line_cache[xb+6]=tmp[6];\n"
" line_cache[xb+7]=tmp[7];\n"
" }\n"
" for (; xb+4 <= INPUT_LINE_SIZE; xb += 4) {\n"
" COMPUTE_FLOAT4 tmp=CONVERT_COMPUTE_FLOAT4(GROUP_READ4(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" }\n"
" for (; xb<INPUT_LINE_SIZE; xb++) {\n"
" line_cache[xb]=(COMPUTE_FLOAT)GROUP_READ(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch);\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++) {\n"
" FLOAT8 src;\n"
" __attribute__((opencl_unroll_hint(8)))\n"
" for (int i=0; i<8; i++) {\n"
"#if FILTER_WIDTH == 1 && DILATION_WIDTH == 1 && STRIDE_WIDTH == 1\n"
" src[i]=line_cache[i];\n"
"#else\n"
" src[i]=line_cache[kw*DILATION_WIDTH+STRIDE_WIDTH*i];\n"
"#endif\n"
" }\n"
" COMPUTE_FLOAT8 weight0=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch));\n"
" COMPUTE_FLOAT8 weight1=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch +\n"
" 8*filter_isv_pitch));\n"
" const COMPUTE_FLOAT8 src0=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,0));\n"
" const COMPUTE_FLOAT8 src1=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,1));\n"
" const COMPUTE_FLOAT8 src2=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,2));\n"
" const COMPUTE_FLOAT8 src3=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,3));\n"
" const COMPUTE_FLOAT8 src4=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,4));\n"
" const COMPUTE_FLOAT8 src5=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,5));\n"
" const COMPUTE_FLOAT8 src6=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,6));\n"
" const COMPUTE_FLOAT8 src7=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,7));\n"
" const COMPUTE_FLOAT8 src8=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,8));\n"
" const COMPUTE_FLOAT8 src9=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,9));\n"
" const COMPUTE_FLOAT8 src10=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,10));\n"
" const COMPUTE_FLOAT8 src11=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,11));\n"
" const COMPUTE_FLOAT8 src12=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,12));\n"
" const COMPUTE_FLOAT8 src13=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,13));\n"
" const COMPUTE_FLOAT8 src14=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,14));\n"
" const COMPUTE_FLOAT8 src15=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,15));\n"
" dst=mad(weight0.s0,src0,dst);\n"
" dst=mad(weight0.s1,src1,dst);\n"
" dst=mad(weight0.s2,src2,dst);\n"
" dst=mad(weight0.s3,src3,dst);\n"
" dst=mad(weight0.s4,src4,dst);\n"
" dst=mad(weight0.s5,src5,dst);\n"
" dst=mad(weight0.s6,src6,dst);\n"
" dst=mad(weight0.s7,src7,dst);\n"
" dst=mad(weight1.s0,src8,dst);\n"
" dst=mad(weight1.s1,src9,dst);\n"
" dst=mad(weight1.s2,src10,dst);\n"
" dst=mad(weight1.s3,src11,dst);\n"
" dst=mad(weight1.s4,src12,dst);\n"
" dst=mad(weight1.s5,src13,dst);\n"
" dst=mad(weight1.s6,src14,dst);\n"
" dst=mad(weight1.s7,src15,dst);\n"
" }\n"
" }\n"
" }\n"
" \n"
"#if SLM_DIV_FACTOR>1\n"
" sum[lid1]=dst;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (feature_sub_block == 0) {\n"
" __attribute__((opencl_unroll_hint)) for(int i=1; i<SLM_DIV_FACTOR; i++)\n"
" dst += sum[lid1 % feature_per_wg+i*feature_per_wg];\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" const uint lid_x=sglid % 4;\n"
" const uint lid_y=sglid/4;\n"
" if ((feature_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<8 && (x+i)<output_width; i++) {\n"
" if ((feature_block*16+lid_y*4+lid_x<output_channel))\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" for (int i=0; i<8 && (x+i)<output_width; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" }\n"
"#endif\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c16_c16_b2(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
") {\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=(uint)get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 1;\n"
" const int y=(xy/x_blocks);\n"
" const int lid1=(int)get_local_id(1);\n"
" const int feature_per_wg=(int)get_local_size(1)/SLM_DIV_FACTOR;\n"
" const int feature_sub_block=lid1/feature_per_wg;\n"
" const int feature_block=(int)get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(input_pad_left+input_width+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(input_height);\n"
" const uint input_b_pitch=input_fs_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(output_pad_left+output_width+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*((output_channel+15)/16);\n"
" const uint output_offset=b*output_b_pitch +\n"
" feature_block*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=16*16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=feature_block*filter_os_pitch;\n"
"#if SLM_DIV_FACTOR == 1\n"
" COMPUTE_FLOAT2 dst=(COMPUTE_FLOAT2)(GROUP_READ(biases,feature_block*16));\n"
"#else\n"
" COMPUTE_FLOAT2 dst;\n"
" if (feature_sub_block == 0) {\n"
" dst=(COMPUTE_FLOAT2)(GROUP_READ(biases,feature_block*16));\n"
" } else {\n"
" dst=(COMPUTE_FLOAT2)0;\n"
" }\n"
"#endif \n"
"#if SLM_DIV_FACTOR>1\n"
" __local COMPUTE_FLOAT2 sum[WORK_GROUP_SIZE];\n"
"#endif\n"
"#if SLM_DIV_FACTOR>1\n"
" for (int icb=feature_sub_block*IC_BLOCKS/SLM_DIV_FACTOR; icb<(feature_sub_block+1)*IC_BLOCKS/SLM_DIV_FACTOR; icb++) {\n"
"#else\n"
" for (int icb=0; icb<IC_BLOCKS; icb++) {\n"
"#endif \n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++) {\n"
" if (input_y+kh*DILATION_HEIGHT<0 || input_y+kh*DILATION_HEIGHT >= input_height)\n"
" continue;\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" {\n"
" int xb=0;\n"
" for (; xb+8 <= INPUT_LINE_SIZE; xb += 8) {\n"
" COMPUTE_FLOAT8 tmp=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" line_cache[xb+4]=tmp[4];\n"
" line_cache[xb+5]=tmp[5];\n"
" line_cache[xb+6]=tmp[6];\n"
" line_cache[xb+7]=tmp[7];\n"
" }\n"
" for (; xb+4 <= INPUT_LINE_SIZE; xb += 4) {\n"
" COMPUTE_FLOAT4 tmp=CONVERT_COMPUTE_FLOAT4(GROUP_READ4(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" }\n"
" for (; xb<INPUT_LINE_SIZE; xb++) {\n"
" line_cache[xb]=(COMPUTE_FLOAT)GROUP_READ(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch);\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++) {\n"
" FLOAT2 src;\n"
" __attribute__((opencl_unroll_hint(2)))\n"
" for (int i=0; i<2; i++) {\n"
"#if FILTER_WIDTH == 1 && DILATION_WIDTH == 1 && STRIDE_WIDTH == 1\n"
" src[i]=line_cache[i];\n"
"#else\n"
" src[i]=line_cache[kw*DILATION_WIDTH+STRIDE_WIDTH*i];\n"
"#endif\n"
" }\n"
" COMPUTE_FLOAT8 weight0=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch));\n"
" COMPUTE_FLOAT8 weight1=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch +\n"
" 8*filter_isv_pitch));\n"
" const COMPUTE_FLOAT2 src0=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,0));\n"
" const COMPUTE_FLOAT2 src1=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,1));\n"
" const COMPUTE_FLOAT2 src2=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,2));\n"
" const COMPUTE_FLOAT2 src3=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,3));\n"
" const COMPUTE_FLOAT2 src4=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,4));\n"
" const COMPUTE_FLOAT2 src5=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,5));\n"
" const COMPUTE_FLOAT2 src6=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,6));\n"
" const COMPUTE_FLOAT2 src7=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,7));\n"
" const COMPUTE_FLOAT2 src8=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,8));\n"
" const COMPUTE_FLOAT2 src9=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,9));\n"
" const COMPUTE_FLOAT2 src10=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,10));\n"
" const COMPUTE_FLOAT2 src11=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,11));\n"
" const COMPUTE_FLOAT2 src12=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,12));\n"
" const COMPUTE_FLOAT2 src13=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,13));\n"
" const COMPUTE_FLOAT2 src14=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,14));\n"
" const COMPUTE_FLOAT2 src15=CONVERT_COMPUTE_FLOAT2(GROUP_SHUFFLE2(src,15));\n"
" dst=mad(weight0.s0,src0,dst);\n"
" dst=mad(weight0.s1,src1,dst);\n"
" dst=mad(weight0.s2,src2,dst);\n"
" dst=mad(weight0.s3,src3,dst);\n"
" dst=mad(weight0.s4,src4,dst);\n"
" dst=mad(weight0.s5,src5,dst);\n"
" dst=mad(weight0.s6,src6,dst);\n"
" dst=mad(weight0.s7,src7,dst);\n"
" dst=mad(weight1.s0,src8,dst);\n"
" dst=mad(weight1.s1,src9,dst);\n"
" dst=mad(weight1.s2,src10,dst);\n"
" dst=mad(weight1.s3,src11,dst);\n"
" dst=mad(weight1.s4,src12,dst);\n"
" dst=mad(weight1.s5,src13,dst);\n"
" dst=mad(weight1.s6,src14,dst);\n"
" dst=mad(weight1.s7,src15,dst);\n"
" }\n"
" }\n"
" }\n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+feature_block*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" pad_offset += (output_width+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" }\n"
" \n"
"#if SLM_DIV_FACTOR>1\n"
" sum[lid1]=dst;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (feature_sub_block == 0) {\n"
" __attribute__((opencl_unroll_hint)) for(int i=1; i<SLM_DIV_FACTOR; i++)\n"
" dst += sum[lid1 % feature_per_wg+i*feature_per_wg];\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT2)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT2)0,(COMPUTE_FLOAT2)6);\n"
"#endif\n"
" if ((feature_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<2; i++) {\n"
" if ((feature_block*16+sglid<output_channel) && (x+i)<output_width)\n"
" output[output_offset+i*output_x_pitch+sglid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (x+2 <= output_width || output_width % 2 == 0) {\n"
" GROUP_WRITE2(output,output_offset,CONVERT_FLOAT2(dst));\n"
" }else{\n"
" for (int i=0; i<output_width % 2; i++) {\n"
" output[output_offset+i*output_x_pitch+sglid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" }\n"
"#endif\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c16_c16_b4(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
") {\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=(uint)get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 2;\n"
" const int y=(xy/x_blocks);\n"
" const int lid1=(int)get_local_id(1);\n"
" const int feature_per_wg=(int)get_local_size(1)/SLM_DIV_FACTOR;\n"
" const int feature_sub_block=lid1/feature_per_wg;\n"
" const int feature_block=(int)get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(input_pad_left+input_width+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(input_height);\n"
" const uint input_b_pitch=input_fs_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(output_pad_left+output_width+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*((output_channel+15)/16);\n"
" const uint output_offset=b*output_b_pitch +\n"
" feature_block*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=16*16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=feature_block*filter_os_pitch;\n"
"#if SLM_DIV_FACTOR == 1\n"
" COMPUTE_FLOAT4 dst=(COMPUTE_FLOAT4)(GROUP_READ(biases,feature_block*16));\n"
"#else\n"
" COMPUTE_FLOAT4 dst;\n"
" if (feature_sub_block == 0) {\n"
" dst=(COMPUTE_FLOAT4)(GROUP_READ(biases,feature_block*16));\n"
" } else {\n"
" dst=(COMPUTE_FLOAT4)0;\n"
" }\n"
"#endif \n"
"#if SLM_DIV_FACTOR>1\n"
" __local COMPUTE_FLOAT4 sum[WORK_GROUP_SIZE];\n"
"#endif\n"
"#if SLM_DIV_FACTOR>1\n"
" for (int icb=feature_sub_block*IC_BLOCKS/SLM_DIV_FACTOR; icb<(feature_sub_block+1)*IC_BLOCKS/SLM_DIV_FACTOR; icb++) {\n"
"#else\n"
" for (int icb=0; icb<IC_BLOCKS; icb++) {\n"
"#endif \n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++) {\n"
" if (input_y+kh*DILATION_HEIGHT<0 || input_y+kh*DILATION_HEIGHT >= input_height)\n"
" continue;\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" {\n"
" int xb=0;\n"
" for (; xb+8 <= INPUT_LINE_SIZE; xb += 8) {\n"
" COMPUTE_FLOAT8 tmp=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" line_cache[xb+4]=tmp[4];\n"
" line_cache[xb+5]=tmp[5];\n"
" line_cache[xb+6]=tmp[6];\n"
" line_cache[xb+7]=tmp[7];\n"
" }\n"
" for (; xb+4 <= INPUT_LINE_SIZE; xb += 4) {\n"
" COMPUTE_FLOAT4 tmp=CONVERT_COMPUTE_FLOAT4(GROUP_READ4(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" }\n"
" for (; xb<INPUT_LINE_SIZE; xb++) {\n"
" line_cache[xb]=(COMPUTE_FLOAT)GROUP_READ(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch);\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++) {\n"
" FLOAT4 src;\n"
" __attribute__((opencl_unroll_hint(4)))\n"
" for (int i=0; i<4; i++) {\n"
"#if FILTER_WIDTH == 1 && DILATION_WIDTH == 1 && STRIDE_WIDTH == 1\n"
" src[i]=line_cache[i];\n"
"#else\n"
" src[i]=line_cache[kw*DILATION_WIDTH+STRIDE_WIDTH*i];\n"
"#endif\n"
" }\n"
" COMPUTE_FLOAT8 weight0=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch));\n"
" COMPUTE_FLOAT8 weight1=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch +\n"
" 8*filter_isv_pitch));\n"
" const COMPUTE_FLOAT4 src0=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,0));\n"
" const COMPUTE_FLOAT4 src1=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,1));\n"
" const COMPUTE_FLOAT4 src2=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,2));\n"
" const COMPUTE_FLOAT4 src3=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,3));\n"
" const COMPUTE_FLOAT4 src4=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,4));\n"
" const COMPUTE_FLOAT4 src5=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,5));\n"
" const COMPUTE_FLOAT4 src6=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,6));\n"
" const COMPUTE_FLOAT4 src7=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,7));\n"
" const COMPUTE_FLOAT4 src8=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,8));\n"
" const COMPUTE_FLOAT4 src9=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,9));\n"
" const COMPUTE_FLOAT4 src10=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,10));\n"
" const COMPUTE_FLOAT4 src11=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,11));\n"
" const COMPUTE_FLOAT4 src12=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,12));\n"
" const COMPUTE_FLOAT4 src13=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,13));\n"
" const COMPUTE_FLOAT4 src14=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,14));\n"
" const COMPUTE_FLOAT4 src15=CONVERT_COMPUTE_FLOAT4(GROUP_SHUFFLE4(src,15));\n"
" dst=mad(weight0.s0,src0,dst);\n"
" dst=mad(weight0.s1,src1,dst);\n"
" dst=mad(weight0.s2,src2,dst);\n"
" dst=mad(weight0.s3,src3,dst);\n"
" dst=mad(weight0.s4,src4,dst);\n"
" dst=mad(weight0.s5,src5,dst);\n"
" dst=mad(weight0.s6,src6,dst);\n"
" dst=mad(weight0.s7,src7,dst);\n"
" dst=mad(weight1.s0,src8,dst);\n"
" dst=mad(weight1.s1,src9,dst);\n"
" dst=mad(weight1.s2,src10,dst);\n"
" dst=mad(weight1.s3,src11,dst);\n"
" dst=mad(weight1.s4,src12,dst);\n"
" dst=mad(weight1.s5,src13,dst);\n"
" dst=mad(weight1.s6,src14,dst);\n"
" dst=mad(weight1.s7,src15,dst);\n"
" }\n"
" }\n"
" }\n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+feature_block*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" pad_offset += (output_width+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" sum[lid1]=dst;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (feature_sub_block == 0) {\n"
" __attribute__((opencl_unroll_hint)) for(int i=1; i<SLM_DIV_FACTOR; i++)\n"
" dst += sum[lid1 % feature_per_wg+i*feature_per_wg];\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" if ((feature_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<4; i++) {\n"
" if ((feature_block*16+sglid<output_channel) && (x+i)<output_width)\n"
" output[output_offset+i*output_x_pitch+sglid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (x+4 <= output_width || output_width % 4 == 0) {\n"
" GROUP_WRITE4(output,output_offset,CONVERT_FLOAT4(dst));\n"
" }else{\n"
" for (int i=0; i<output_width % 4; i++) {\n"
" output[output_offset+i*output_x_pitch+sglid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" }\n"
"#endif\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c16_c16_b8(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
") {\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=(uint)get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 3;\n"
" const int y=(xy/x_blocks);\n"
" const int lid1=(int)get_local_id(1);\n"
" const int feature_per_wg=(int)get_local_size(1)/SLM_DIV_FACTOR;\n"
" const int feature_sub_block=lid1/feature_per_wg;\n"
" const int feature_block=(int)get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(input_pad_left+input_width+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(input_height);\n"
" const uint input_b_pitch=input_fs_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(output_pad_left+output_width+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*((output_channel+15)/16);\n"
" const uint output_offset=b*output_b_pitch +\n"
" feature_block*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=16*16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=feature_block*filter_os_pitch;\n"
"#if SLM_DIV_FACTOR == 1\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(GROUP_READ(biases,feature_block*16));\n"
"#else\n"
" COMPUTE_FLOAT8 dst;\n"
" if (feature_sub_block == 0) {\n"
" dst=(COMPUTE_FLOAT8)(GROUP_READ(biases,feature_block*16));\n"
" } else {\n"
" dst=(COMPUTE_FLOAT8)0;\n"
" }\n"
"#endif \n"
"#if SLM_DIV_FACTOR>1\n"
" __local COMPUTE_FLOAT8 sum[WORK_GROUP_SIZE];\n"
"#endif\n"
"#if SLM_DIV_FACTOR>1\n"
" for (int icb=feature_sub_block*IC_BLOCKS/SLM_DIV_FACTOR; icb<(feature_sub_block+1)*IC_BLOCKS/SLM_DIV_FACTOR; icb++) {\n"
"#else\n"
" for (int icb=0; icb<IC_BLOCKS; icb++) {\n"
"#endif \n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++) {\n"
" if (input_y+kh*DILATION_HEIGHT<0 || input_y+kh*DILATION_HEIGHT >= input_height)\n"
" continue;\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" {\n"
" int xb=0;\n"
" for (; xb+8 <= INPUT_LINE_SIZE; xb += 8) {\n"
" COMPUTE_FLOAT8 tmp=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" line_cache[xb+4]=tmp[4];\n"
" line_cache[xb+5]=tmp[5];\n"
" line_cache[xb+6]=tmp[6];\n"
" line_cache[xb+7]=tmp[7];\n"
" }\n"
" for (; xb+4 <= INPUT_LINE_SIZE; xb += 4) {\n"
" COMPUTE_FLOAT4 tmp=CONVERT_COMPUTE_FLOAT4(GROUP_READ4(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch));\n"
" \n"
" line_cache[xb+0]=tmp[0];\n"
" line_cache[xb+1]=tmp[1];\n"
" line_cache[xb+2]=tmp[2];\n"
" line_cache[xb+3]=tmp[3];\n"
" }\n"
" for (; xb<INPUT_LINE_SIZE; xb++) {\n"
" line_cache[xb]=(COMPUTE_FLOAT)GROUP_READ(input,input_offset +\n"
" icb*input_fs_pitch +\n"
" kh*DILATION_HEIGHT*input_y_pitch +\n"
" xb*input_x_pitch);\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++) {\n"
" FLOAT8 src;\n"
" __attribute__((opencl_unroll_hint(8)))\n"
" for (int i=0; i<8; i++) {\n"
"#if FILTER_WIDTH == 1 && DILATION_WIDTH == 1 && STRIDE_WIDTH == 1\n"
" src[i]=line_cache[i];\n"
"#else\n"
" src[i]=line_cache[kw*DILATION_WIDTH+STRIDE_WIDTH*i];\n"
"#endif\n"
" }\n"
" COMPUTE_FLOAT8 weight0=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch));\n"
" COMPUTE_FLOAT8 weight1=CONVERT_COMPUTE_FLOAT8(GROUP_READ8(weights,filter_offset +\n"
" icb*filter_is_pitch +\n"
" kh*filter_y_pitch +\n"
" kw*filter_x_pitch +\n"
" 8*filter_isv_pitch));\n"
" const COMPUTE_FLOAT8 src0=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,0));\n"
" const COMPUTE_FLOAT8 src1=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,1));\n"
" const COMPUTE_FLOAT8 src2=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,2));\n"
" const COMPUTE_FLOAT8 src3=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,3));\n"
" const COMPUTE_FLOAT8 src4=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,4));\n"
" const COMPUTE_FLOAT8 src5=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,5));\n"
" const COMPUTE_FLOAT8 src6=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,6));\n"
" const COMPUTE_FLOAT8 src7=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,7));\n"
" const COMPUTE_FLOAT8 src8=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,8));\n"
" const COMPUTE_FLOAT8 src9=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,9));\n"
" const COMPUTE_FLOAT8 src10=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,10));\n"
" const COMPUTE_FLOAT8 src11=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,11));\n"
" const COMPUTE_FLOAT8 src12=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,12));\n"
" const COMPUTE_FLOAT8 src13=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,13));\n"
" const COMPUTE_FLOAT8 src14=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,14));\n"
" const COMPUTE_FLOAT8 src15=CONVERT_COMPUTE_FLOAT8(GROUP_SHUFFLE8(src,15));\n"
" dst=mad(weight0.s0,src0,dst);\n"
" dst=mad(weight0.s1,src1,dst);\n"
" dst=mad(weight0.s2,src2,dst);\n"
" dst=mad(weight0.s3,src3,dst);\n"
" dst=mad(weight0.s4,src4,dst);\n"
" dst=mad(weight0.s5,src5,dst);\n"
" dst=mad(weight0.s6,src6,dst);\n"
" dst=mad(weight0.s7,src7,dst);\n"
" dst=mad(weight1.s0,src8,dst);\n"
" dst=mad(weight1.s1,src9,dst);\n"
" dst=mad(weight1.s2,src10,dst);\n"
" dst=mad(weight1.s3,src11,dst);\n"
" dst=mad(weight1.s4,src12,dst);\n"
" dst=mad(weight1.s5,src13,dst);\n"
" dst=mad(weight1.s6,src14,dst);\n"
" dst=mad(weight1.s7,src15,dst);\n"
" }\n"
" }\n"
" }\n"
" \n"
" \n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+feature_block*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" pad_offset += (output_width+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" sum[lid1]=dst;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (feature_sub_block == 0) {\n"
" __attribute__((opencl_unroll_hint)) for(int i=1; i<SLM_DIV_FACTOR; i++)\n"
" dst += sum[lid1 % feature_per_wg+i*feature_per_wg];\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" if ((feature_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<8; i++) {\n"
" if ((feature_block*16+sglid<output_channel) && (x+i)<output_width)\n"
" output[output_offset+i*output_x_pitch+sglid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (x+8 <= output_width || output_width % 8 == 0) {\n"
" GROUP_WRITE8(output,output_offset,CONVERT_FLOAT8(dst));\n"
" }else{\n"
" for (int i=0; i<output_width % 8; i++) {\n"
" output[output_offset+i*output_x_pitch+sglid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" }\n"
"#if SLM_DIV_FACTOR>1\n"
" }\n"
"#endif\n"
"}\n"
;
#endif
#endif
}
